Prerequisites

Load required packages

library(tidyverse)
library(dplyr)
library(ggplot2)
library(rtweet)
library(readr)
library(DataExplorer)

Dataset

Import processed data, which can be found here.

#read preprocessed data
wines <- read.csv(file = '../data/processed_data/wines.csv')

Get sample of dataset

#set seed value to birthday of Ricardo Rodriguez, American wrestler and ring announcer and Dr. Reinaldo (Rei) Sanchez-Arias
set.seed(19630217)

#set percentage to test with for simplicity, if needed
percentage <- 5
wine_sample<- sample_n(wines, percentage/100*nrow(wines))

Split Taster data into different Data Frame

tasters <- wines %>%
  select(taster_name, taster_twitter_handle) %>% unique()
tasters

Drop taster_twitter_handle in wines dataframe

wines <- wines %>%
  select(-taster_twitter_handle)
head(wines)

Add Reviewer profile info

Each reviewer has there own bias. To offset that we made a “profile” for each reviewer which includes characteristics like: avg_points, sd_points, and var_points

taster_rating_profile <- wines %>%
  group_by(taster_name) %>%
  summarize(
    avg_points = mean(points),
    sd_points = sd(points),
    var_points = var(points),
    reviews = n()
  )

tasters <- inner_join(tasters, taster_rating_profile, by = "taster_name")
head(tasters)

Add Rating Classification

Add following classification to wine dataset as found on the website:

Category Rating Description
Classic 98-100 The pinnacle of quality.
Superb 94-97 A great achievement.
Excellent 90-93 Highly recommended.
Very Good 87-89 Often good value; well recommended.
Good 83-86 Suitable for everyday consumption; often good value.
Acceptable 80-82 Can be employed in casual, less-critical circumstances
# function to add rating
rating_category <- function(points){
  if(points>=98){
    return("Classic")
  }
  else if (points>=94){
    return("Superb")
  }
  else if(points>=90){
    return("Excellent")
  }
  else if(points>=87){
    return("Very Good")
  }
  else if(points>=83){
    return("Good")
  }
  else{
    return("Acceptable")
  }
}

wines<- wines %>%
  rowwise() %>%
  mutate(rating_category = rating_category(points))
head(wines)

Add Adjusted Points

Since, each reviewer has a different bias we created a normalized metric, norm_points, by looking at the number of standard deviatioins a wine is from the reviewer’s avg_points. This gives use a more accurate representation of which which wines are better than the rest.

normalize_points <- function(data){
  left_join(data, tasters, by = "taster_name")%>%
    rowwise() %>%
    mutate(norm_points = (points-avg_points)/sd_points) %>%
    select(-avg_points, -sd_points, -var_points, -taster_twitter_handle, -reviews)
}

wines <- normalize_points(wines)
head(wines) 

Data Sanitation

Vintage seems to have year 7200

wines <- wines %>%
  filter(vintage<2019)

Data Exploration

Univariate Exploration

Correlation price by points, using DataExplorer library which can be found here

# TODO: IZZY

Alcohol Amount

# TODO: IZZY

Category

# TODO: IZZY

Vintage

Count wines per year (Note: Data has been sanitized)

wines %>%
  group_by(vintage) %>%
  summarize(count = n())
Grouping rowwise data frame strips rowwise nature
wines %>%
  ggplot() +
  geom_bar(mapping = aes(x=vintage)) +
  x_lim(1900)
Error in x_lim(1900) : could not find function "x_lim"

Winery

# TODO: Osaki

Province

# TODO: OSAKI

Price

# TODO: OASKI (This is not producing correct results)
wines %>%
    summarize(avg_price = mean(price, na.rm=TRUE), 
              sd_price = sd(price, na.rm=TRUE),
              lowest_price = min(price, na.rm=TRUE),
              highest_price = max(price,na.rm=TRUE))

Points

# TODO: OASKI (This is not producing correct results)
wines %>%
    summarize(avg_points = mean(points, na.rm=TRUE), 
              sd_points = sd(points, na.rm=TRUE),
              lowest_points = min(points, na.rm=TRUE),
              highest_points = max(points,na.rm=TRUE))

Points distribution by Reviewer

wines %>%
  ggplot() +
  geom_boxplot(aes(y=taster_name, x=points)) +
  geom_vline(xintercept = mean(wines$points))

Multivariate Exploration

Price by Points

Notice the data is “stacked” and the socres range from 80-100

wines %>% 
  ggplot() +
  geom_point(mapping = (aes(x = points, y = price)), na.rm = T, alpha = 0.15) +
  labs(title = "Price by Points", x = "Points", y = "Price")

TODO: IZZY (Why did we log this?)

wines %>% 
  ggplot() +
  geom_point(mapping = (aes(x = points, y = log(price))), na.rm = T, alpha = 0.15) +
  labs(title = "log(Price) by Points", x = "Points", y = "log(Price)")

Data Analysis

#Find the best province for wine using the average points across the 1,000 samples #drop the descriptions or just select price? set points to max(points)

best_province <- wine_sample %>% 
  group_by(province, points) %>% 
  filter(points > 88.669)
best_province  

Best wine, by variety

#wine_best_variety <- 
wines %>% 
  group_by(variety) %>% 
  summarise(mean_points = mean(points)) %>% 
  arrange(desc(mean_points)) 
  
user_price <- readline(prompt = "How much are you willing to spend on a bottle?")
user_price <- as.integer(user_price)

wines %>% 
  filter(price <= user_price) %>% 
  arrange(desc(points)) %>% 
  select(title, price, points)

Conclusion

LS0tCnRpdGxlOiAiRXhwbG9yaW5nIGFuZCBBbmFseWl6aW5nIFdpbmUgRW50aHVzaWFzdCBSZXZpZXdzIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIFByZXJlcXVpc2l0ZXMKCkxvYWQgcmVxdWlyZWQgcGFja2FnZXMKYGBge3IsIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KGRwbHlyKQpsaWJyYXJ5KGdncGxvdDIpCmxpYnJhcnkocnR3ZWV0KQpsaWJyYXJ5KHJlYWRyKQpsaWJyYXJ5KERhdGFFeHBsb3JlcikKYGBgCgojIERhdGFzZXQKCkltcG9ydCBwcm9jZXNzZWQgZGF0YSwgd2hpY2ggY2FuIGJlIGZvdW5kIFtoZXJlXShodHRwczovL2dpdGh1Yi5jb20vQzRyYnluM200bi93aW5lX3Jldmlld3NfZGF0YV9hbmFseXNpcy9ibG9iL21hc3Rlci9kYXRhL3Byb2Nlc3NlZF9kYXRhL3ByZXByb2Nlc3Npbmcucm1kKS4KCmBgYHtyfQojcmVhZCBwcmVwcm9jZXNzZWQgZGF0YQp3aW5lcyA8LSByZWFkLmNzdihmaWxlID0gJy4uL2RhdGEvcHJvY2Vzc2VkX2RhdGEvd2luZXMuY3N2JykKYGBgCgpHZXQgc2FtcGxlIG9mIGRhdGFzZXQKYGBge3J9CiNzZXQgc2VlZCB2YWx1ZSB0byBiaXJ0aGRheSBvZiBSaWNhcmRvIFJvZHJpZ3VleiwgQW1lcmljYW4gd3Jlc3RsZXIgYW5kIHJpbmcgYW5ub3VuY2VyIGFuZCBEci4gUmVpbmFsZG8gKFJlaSkgU2FuY2hlei1BcmlhcwpzZXQuc2VlZCgxOTYzMDIxNykKCiNzZXQgcGVyY2VudGFnZSB0byB0ZXN0IHdpdGggZm9yIHNpbXBsaWNpdHksIGlmIG5lZWRlZApwZXJjZW50YWdlIDwtIDUKd2luZV9zYW1wbGU8LSBzYW1wbGVfbih3aW5lcywgcGVyY2VudGFnZS8xMDAqbnJvdyh3aW5lcykpCmBgYAoKIyMjIFNwbGl0IFRhc3RlciBkYXRhIGludG8gZGlmZmVyZW50IERhdGEgRnJhbWUKCmBgYHtyfQp0YXN0ZXJzIDwtIHdpbmVzICU+JQogIHNlbGVjdCh0YXN0ZXJfbmFtZSwgdGFzdGVyX3R3aXR0ZXJfaGFuZGxlKSAlPiUgdW5pcXVlKCkKdGFzdGVycwpgYGAKCkRyb3AgYHRhc3Rlcl90d2l0dGVyX2hhbmRsZWAgaW4gd2luZXMgZGF0YWZyYW1lCgpgYGB7cn0Kd2luZXMgPC0gd2luZXMgJT4lCiAgc2VsZWN0KC10YXN0ZXJfdHdpdHRlcl9oYW5kbGUpCmhlYWQod2luZXMpCmBgYAojIyBBZGQgUmV2aWV3ZXIgcHJvZmlsZSBpbmZvCgpFYWNoIHJldmlld2VyIGhhcyB0aGVyZSBvd24gYmlhcy4gVG8gb2Zmc2V0IHRoYXQgd2UgbWFkZSBhICJwcm9maWxlIiBmb3IgZWFjaCByZXZpZXdlciB3aGljaCBpbmNsdWRlcyBjaGFyYWN0ZXJpc3RpY3MgbGlrZTogYGF2Z19wb2ludHNgLCBgc2RfcG9pbnRzYCwgYW5kIGB2YXJfcG9pbnRzYApgYGB7cn0KdGFzdGVyX3JhdGluZ19wcm9maWxlIDwtIHdpbmVzICU+JQogIGdyb3VwX2J5KHRhc3Rlcl9uYW1lKSAlPiUKICBzdW1tYXJpemUoCiAgICBhdmdfcG9pbnRzID0gbWVhbihwb2ludHMpLAogICAgc2RfcG9pbnRzID0gc2QocG9pbnRzKSwKICAgIHZhcl9wb2ludHMgPSB2YXIocG9pbnRzKSwKICAgIHJldmlld3MgPSBuKCkKICApCgp0YXN0ZXJzIDwtIGlubmVyX2pvaW4odGFzdGVycywgdGFzdGVyX3JhdGluZ19wcm9maWxlLCBieSA9ICJ0YXN0ZXJfbmFtZSIpCmhlYWQodGFzdGVycykKYGBgCiMjIyBBZGQgUmF0aW5nIENsYXNzaWZpY2F0aW9uCgpBZGQgZm9sbG93aW5nIGNsYXNzaWZpY2F0aW9uIHRvIHdpbmUgZGF0YXNldCBhcyBmb3VuZCBvbiB0aGUgW3dlYnNpdGVdKGh0dHBzOi8vd3d3LndpbmVtYWcuY29tLzIwMTAvMDQvMDkveW91LWFza2VkLWhvdy1pcy1hLXdpbmVzLXNjb3JlLWRldGVybWluZWQvKToKCnxDYXRlZ29yeSAgfCBSYXRpbmcgIHwgRGVzY3JpcHRpb24gICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHwKfC0tLS0tLS0tLS18LS0tLS0tLS0tfC0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tfAp8Q2xhc3NpYyAgIHwJOTgtMTAwIHwgVGhlIHBpbm5hY2xlIG9mIHF1YWxpdHkuICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHwKfFN1cGVyYiAgICB8CTk0LTk3CSB8IEEgZ3JlYXQgYWNoaWV2ZW1lbnQuICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICB8CnxFeGNlbGxlbnQgfAk5MC05MwkgfCBIaWdobHkgcmVjb21tZW5kZWQuICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgfAp8VmVyeSBHb29kIHwgIDg3LTg5CSB8IE9mdGVuIGdvb2QgdmFsdWU7IHdlbGwgcmVjb21tZW5kZWQuICAgICAgICAgICAgICAgICAgICB8CnxHb29kCSAgICAgfCAgODMtODYJIHwgU3VpdGFibGUgZm9yIGV2ZXJ5ZGF5IGNvbnN1bXB0aW9uOyBvZnRlbiBnb29kIHZhbHVlLiAgIHwKfEFjY2VwdGFibGV8CTgwLTgyCSB8IENhbiBiZSBlbXBsb3llZCBpbiBjYXN1YWwsIGxlc3MtY3JpdGljYWwgY2lyY3Vtc3RhbmNlcyB8CgpgYGB7cn0KIyBmdW5jdGlvbiB0byBhZGQgcmF0aW5nCnJhdGluZ19jYXRlZ29yeSA8LSBmdW5jdGlvbihwb2ludHMpewogIGlmKHBvaW50cz49OTgpewogICAgcmV0dXJuKCJDbGFzc2ljIikKICB9CiAgZWxzZSBpZiAocG9pbnRzPj05NCl7CiAgICByZXR1cm4oIlN1cGVyYiIpCiAgfQogIGVsc2UgaWYocG9pbnRzPj05MCl7CiAgICByZXR1cm4oIkV4Y2VsbGVudCIpCiAgfQogIGVsc2UgaWYocG9pbnRzPj04Nyl7CiAgICByZXR1cm4oIlZlcnkgR29vZCIpCiAgfQogIGVsc2UgaWYocG9pbnRzPj04Myl7CiAgICByZXR1cm4oIkdvb2QiKQogIH0KICBlbHNlewogICAgcmV0dXJuKCJBY2NlcHRhYmxlIikKICB9Cn0KCndpbmVzPC0gd2luZXMgJT4lCiAgcm93d2lzZSgpICU+JQogIG11dGF0ZShyYXRpbmdfY2F0ZWdvcnkgPSByYXRpbmdfY2F0ZWdvcnkocG9pbnRzKSkKaGVhZCh3aW5lcykKYGBgCgojIyBBZGQgQWRqdXN0ZWQgUG9pbnRzCgpTaW5jZSwgZWFjaCByZXZpZXdlciBoYXMgYSBkaWZmZXJlbnQgYmlhcyB3ZSBjcmVhdGVkIGEgbm9ybWFsaXplZCBtZXRyaWMsIGBub3JtX3BvaW50c2AsIGJ5IGxvb2tpbmcgYXQgdGhlIG51bWJlciBvZiBzdGFuZGFyZCBkZXZpYXRpb2lucyBhIHdpbmUgaXMgZnJvbSB0aGUgcmV2aWV3ZXIncyBgYXZnX3BvaW50c2AuIFRoaXMgZ2l2ZXMgdXNlIGEgbW9yZSBhY2N1cmF0ZSByZXByZXNlbnRhdGlvbiBvZiB3aGljaCB3aGljaCB3aW5lcyBhcmUgYmV0dGVyIHRoYW4gdGhlIHJlc3QuCgpgYGB7cn0Kbm9ybWFsaXplX3BvaW50cyA8LSBmdW5jdGlvbihkYXRhKXsKICBsZWZ0X2pvaW4oZGF0YSwgdGFzdGVycywgYnkgPSAidGFzdGVyX25hbWUiKSU+JQogICAgcm93d2lzZSgpICU+JQogICAgbXV0YXRlKG5vcm1fcG9pbnRzID0gKHBvaW50cy1hdmdfcG9pbnRzKS9zZF9wb2ludHMpICU+JQogICAgc2VsZWN0KC1hdmdfcG9pbnRzLCAtc2RfcG9pbnRzLCAtdmFyX3BvaW50cywgLXRhc3Rlcl90d2l0dGVyX2hhbmRsZSwgLXJldmlld3MpCn0KCndpbmVzIDwtIG5vcm1hbGl6ZV9wb2ludHMod2luZXMpCmhlYWQod2luZXMpIApgYGAKCiMjIERhdGEgU2FuaXRhdGlvbgoKVmludGFnZSBzZWVtcyB0byBoYXZlIHllYXIgNzIwMApgYGAge3J9CndpbmVzIDwtIHdpbmVzICU+JQogIGZpbHRlcih2aW50YWdlPDIwMTkpCmBgYAojIERhdGEgRXhwbG9yYXRpb24KCiMjIFVuaXZhcmlhdGUgRXhwbG9yYXRpb24KQ29ycmVsYXRpb24gYHByaWNlYCBieSBgcG9pbnRzYCwgdXNpbmcgYGBgRGF0YUV4cGxvcmVyYGBgIGxpYnJhcnkgd2hpY2ggY2FuIGJlIGZvdW5kIFtoZXJlXShodHRwczovL2RhdGFzY2llbmNlcGx1cy5jb20vYmxhemluZy1mYXN0LWVkYS1pbi1yLXdpdGgtZGF0YWV4cGxvcmVyLykKYGBge3J9CiMgVE9ETzogSVpaWQpgYGAKCiMjIyBBbGNvaG9sIEFtb3VudApgYGB7cn0KIyBUT0RPOiBJWlpZCmBgYAoKIyMjIENhdGVnb3J5CmBgYHtyfQojIFRPRE86IElaWlkKYGBgCgojIyMgVmludGFnZQpDb3VudCB3aW5lcyBwZXIgeWVhciAoTm90ZTogRGF0YSBoYXMgYmVlbiBzYW5pdGl6ZWQpCmBgYHtyfQp3aW5lcyAlPiUKICBncm91cF9ieSh2aW50YWdlKSAlPiUKICBzdW1tYXJpemUoY291bnQgPSBuKCkpCmBgYAoKCmBgYHtyfQp3aW5lcyAlPiUKICBnZ3Bsb3QoKSArCiAgZ2VvbV9iYXIobWFwcGluZyA9IGFlcyh4PXZpbnRhZ2UpKSArCiAgCmBgYAoKIyMjIFdpbmVyeQpgYGB7cn0KIyBUT0RPOiBPc2FraQpgYGAKCiMjIyBQcm92aW5jZQpgYGB7cn0KIyBUT0RPOiBPU0FLSQpgYGAKCiMjIyBQcmljZQpgYGB7cn0KIyBUT0RPOiBPQVNLSSAoVGhpcyBpcyBub3QgcHJvZHVjaW5nIGNvcnJlY3QgcmVzdWx0cykKd2luZXMgJT4lCiAgICBzdW1tYXJpemUoYXZnX3ByaWNlID0gbWVhbihwcmljZSwgbmEucm09VFJVRSksIAogICAgICAgICAgICAgIHNkX3ByaWNlID0gc2QocHJpY2UsIG5hLnJtPVRSVUUpLAogICAgICAgICAgICAgIGxvd2VzdF9wcmljZSA9IG1pbihwcmljZSwgbmEucm09VFJVRSksCiAgICAgICAgICAgICAgaGlnaGVzdF9wcmljZSA9IG1heChwcmljZSxuYS5ybT1UUlVFKSkKYGBgCgojIyMgUG9pbnRzCmBgYHtyfQojIFRPRE86IE9BU0tJIChUaGlzIGlzIG5vdCBwcm9kdWNpbmcgY29ycmVjdCByZXN1bHRzKQp3aW5lcyAlPiUKICAgIHN1bW1hcml6ZShhdmdfcG9pbnRzID0gbWVhbihwb2ludHMsIG5hLnJtPVRSVUUpLCAKICAgICAgICAgICAgICBzZF9wb2ludHMgPSBzZChwb2ludHMsIG5hLnJtPVRSVUUpLAogICAgICAgICAgICAgIGxvd2VzdF9wb2ludHMgPSBtaW4ocG9pbnRzLCBuYS5ybT1UUlVFKSwKICAgICAgICAgICAgICBoaWdoZXN0X3BvaW50cyA9IG1heChwb2ludHMsbmEucm09VFJVRSkpCmBgYAoKUG9pbnRzIGRpc3RyaWJ1dGlvbiBieSBSZXZpZXdlcgpgYGB7cn0Kd2luZXMgJT4lCiAgZ2dwbG90KCkgKwogIGdlb21fYm94cGxvdChhZXMoeT10YXN0ZXJfbmFtZSwgeD1wb2ludHMpKSArCiAgZ2VvbV92bGluZSh4aW50ZXJjZXB0ID0gbWVhbih3aW5lcyRwb2ludHMpKQpgYGAKCiMjIE11bHRpdmFyaWF0ZSBFeHBsb3JhdGlvbgoKIyMgUHJpY2UgYnkgUG9pbnRzCk5vdGljZSB0aGUgZGF0YSBpcyAic3RhY2tlZCIgYW5kIHRoZSBzb2NyZXMgcmFuZ2UgZnJvbSA4MC0xMDAKYGBge3J9CndpbmVzICU+JSAKICBnZ3Bsb3QoKSArCiAgZ2VvbV9wb2ludChtYXBwaW5nID0gKGFlcyh4ID0gcG9pbnRzLCB5ID0gcHJpY2UpKSwgbmEucm0gPSBULCBhbHBoYSA9IDAuMTUpICsKICBsYWJzKHRpdGxlID0gIlByaWNlIGJ5IFBvaW50cyIsIHggPSAiUG9pbnRzIiwgeSA9ICJQcmljZSIpCmBgYAoKVE9ETzogSVpaWSAoV2h5IGRpZCB3ZSBsb2cgdGhpcz8pCgpgYGB7cn0Kd2luZXMgJT4lIAogIGdncGxvdCgpICsKICBnZW9tX3BvaW50KG1hcHBpbmcgPSAoYWVzKHggPSBwb2ludHMsIHkgPSBsb2cocHJpY2UpKSksIG5hLnJtID0gVCwgYWxwaGEgPSAwLjE1KSArCiAgbGFicyh0aXRsZSA9ICJsb2coUHJpY2UpIGJ5IFBvaW50cyIsIHggPSAiUG9pbnRzIiwgeSA9ICJsb2coUHJpY2UpIikKYGBgCgojIERhdGEgQW5hbHlzaXMKCiNGaW5kIHRoZSBiZXN0IHByb3ZpbmNlIGZvciB3aW5lIHVzaW5nIHRoZSBhdmVyYWdlIHBvaW50cyBhY3Jvc3MgdGhlIDEsMDAwIHNhbXBsZXMKI2Ryb3AgdGhlIGRlc2NyaXB0aW9ucyBvciBqdXN0IHNlbGVjdCBwcmljZT8gc2V0IHBvaW50cyB0byBtYXgocG9pbnRzKQpgYGB7cn0KYmVzdF9wcm92aW5jZSA8LSB3aW5lX3NhbXBsZSAlPiUgCiAgZ3JvdXBfYnkocHJvdmluY2UsIHBvaW50cykgJT4lIAogIGZpbHRlcihwb2ludHMgPiA4OC42NjkpCmJlc3RfcHJvdmluY2UgIApgYGAKCgpCZXN0IHdpbmUsIGJ5IHZhcmlldHkKYGBge3J9CiN3aW5lX2Jlc3RfdmFyaWV0eSA8LSAKd2luZXMgJT4lIAogIGdyb3VwX2J5KHZhcmlldHkpICU+JSAKICBzdW1tYXJpc2UobWVhbl9wb2ludHMgPSBtZWFuKHBvaW50cykpICU+JSAKICBhcnJhbmdlKGRlc2MobWVhbl9wb2ludHMpKSAKICAKYGBgCgpgYGB7cn0KdXNlcl9wcmljZSA8LSByZWFkbGluZShwcm9tcHQgPSAiSG93IG11Y2ggYXJlIHlvdSB3aWxsaW5nIHRvIHNwZW5kIG9uIGEgYm90dGxlPyIpCnVzZXJfcHJpY2UgPC0gYXMuaW50ZWdlcih1c2VyX3ByaWNlKQoKd2luZXMgJT4lIAogIGZpbHRlcihwcmljZSA8PSB1c2VyX3ByaWNlKSAlPiUgCiAgYXJyYW5nZShkZXNjKHBvaW50cykpICU+JSAKICBzZWxlY3QodGl0bGUsIHByaWNlLCBwb2ludHMpCmBgYAoKCiMgQ29uY2x1c2lvbgo=